#### Load libraries
library(dplyr) # Used for data manipulation
library(knitr) # Used for R-Markdown knitting
library(kableExtra) # Used for Kable Tables
library(plotly) # Used for plotting
library(lubridate) #Used for Date manipulation
library(randomForest) # Used for building Random Forest models
library(rpart) # Used for building CART models
library(rpart.plot) # Used for plotting tree
#### Load Data
crashes_df <- read.csv('./Motor_Vehicle_Collisions_-_Crashes.csv', stringsAsFactors = FALSE) %>%
mutate(CRASH.DATE = as.Date(CRASH.DATE, "%m/%d/%Y")) #1,896,229 x 29
# crashes_df
# min(crashes_df$CRASH.DATE) #"2012-07-01"
# max(crashes_df$CRASH.DATE) #"2022-05-29
kable(t(summary(crashes_df))) %>% kable_classic(full_width = TRUE, html_font = "Cambria", font_size = 14)
| CRASH.DATE | Min. :2012-07-01 | 1st Qu.:2014-10-28 | Median :2016-12-15 | Mean :2017-01-01 | 3rd Qu.:2019-01-04 | Max. :2022-05-29 | NA |
| CRASH.TIME | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| BOROUGH | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| ZIP.CODE | Min. :10000 | 1st Qu.:10306 | Median :11207 | Mean :10837 | 3rd Qu.:11237 | Max. :11697 | NA’s :587695 |
| LATITUDE | Min. : 0.00 | 1st Qu.:40.67 | Median :40.72 | Mean :40.64 | 3rd Qu.:40.77 | Max. :43.34 | NA’s :220042 |
| LONGITUDE | Min. :-201.36 | 1st Qu.: -73.98 | Median : -73.93 | Mean : -73.77 | 3rd Qu.: -73.87 | Max. : 0.00 | NA’s :220042 |
| LOCATION | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| ON.STREET.NAME | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| CROSS.STREET.NAME | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| OFF.STREET.NAME | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| NUMBER.OF.PERSONS.INJURED | Min. : 0.0000 | 1st Qu.: 0.0000 | Median : 0.0000 | Mean : 0.2873 | 3rd Qu.: 0.0000 | Max. :43.0000 | NA’s :18 |
| NUMBER.OF.PERSONS.KILLED | Min. :0.000000 | 1st Qu.:0.000000 | Median :0.000000 | Mean :0.001358 | 3rd Qu.:0.000000 | Max. :8.000000 | NA’s :31 |
| NUMBER.OF.PEDESTRIANS.INJURED | Min. : 0.00000 | 1st Qu.: 0.00000 | Median : 0.00000 | Mean : 0.05304 | 3rd Qu.: 0.00000 | Max. :27.00000 | NA |
| NUMBER.OF.PEDESTRIANS.KILLED | Min. :0.000000 | 1st Qu.:0.000000 | Median :0.000000 | Mean :0.000697 | 3rd Qu.:0.000000 | Max. :6.000000 | NA |
| NUMBER.OF.CYCLIST.INJURED | Min. :0.00000 | 1st Qu.:0.00000 | Median :0.00000 | Mean :0.02435 | 3rd Qu.:0.00000 | Max. :4.00000 | NA |
| NUMBER.OF.CYCLIST.KILLED | Min. :0.0000000 | 1st Qu.:0.0000000 | Median :0.0000000 | Mean :0.0001007 | 3rd Qu.:0.0000000 | Max. :2.0000000 | NA |
| NUMBER.OF.MOTORIST.INJURED | Min. : 0.0000 | 1st Qu.: 0.0000 | Median : 0.0000 | Mean : 0.2083 | 3rd Qu.: 0.0000 | Max. :43.0000 | NA |
| NUMBER.OF.MOTORIST.KILLED | Min. :0.00000 | 1st Qu.:0.00000 | Median :0.00000 | Mean :0.00055 | 3rd Qu.:0.00000 | Max. :5.00000 | NA |
| CONTRIBUTING.FACTOR.VEHICLE.1 | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| CONTRIBUTING.FACTOR.VEHICLE.2 | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| CONTRIBUTING.FACTOR.VEHICLE.3 | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| CONTRIBUTING.FACTOR.VEHICLE.4 | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| CONTRIBUTING.FACTOR.VEHICLE.5 | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| COLLISION_ID | Min. : 22 | 1st Qu.:3046695 | Median :3584305 | Mean :3021392 | 3rd Qu.:4058626 | Max. :4533068 | NA |
| VEHICLE.TYPE.CODE.1 | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE.TYPE.CODE.2 | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE.TYPE.CODE.3 | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE.TYPE.CODE.4 | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE.TYPE.CODE.5 | Length:1896229 | Class :character | Mode :character | NA | NA | NA | NA |
#### Load Data
person_df <- read.csv('./Motor_Vehicle_Collisions_-_Person.csv', stringsAsFactors = FALSE) %>%
mutate(CRASH_DATE = as.Date(CRASH_DATE, "%m/%d/%Y")) #4,692,054 x 21
# person_df
# min(person_df$CRASH_DATE) #"2012-07-01"
# max(person_df$CRASH_DATE) #"2022-05-29"
kable(t(summary(person_df))) %>% kable_classic(full_width = TRUE, html_font = "Cambria", font_size = 14)
| UNIQUE_ID | Min. : 10922 | 1st Qu.: 6812186 | Median : 8996148 | Mean : 8531863 | 3rd Qu.:10216281 | Max. :12239058 | NA |
| COLLISION_ID | Min. : 37 | 1st Qu.:3638855 | Median :3921823 | Mean :3853306 | 3rd Qu.:4210666 | Max. :4533068 | NA |
| CRASH_DATE | Min. :2012-07-01 | 1st Qu.:2017-03-19 | Median :2018-06-08 | Mean :2018-07-08 | 3rd Qu.:2019-09-20 | Max. :2022-05-29 | NA |
| CRASH_TIME | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| PERSON_ID | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| PERSON_TYPE | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| PERSON_INJURY | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE_ID | Min. : 123423 | 1st Qu.:17466247 | Median :18528882 | Mean :18253620 | 3rd Qu.:19125401 | Max. :20229580 | NA’s :185684 |
| PERSON_AGE | Min. :-999.0 | 1st Qu.: 23.0 | Median : 35.0 | Mean : 36.8 | 3rd Qu.: 50.0 | Max. :9999.0 | NA’s :453265 |
| EJECTION | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| EMOTIONAL_STATUS | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| BODILY_INJURY | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| POSITION_IN_VEHICLE | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| SAFETY_EQUIPMENT | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| PED_LOCATION | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| PED_ACTION | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| COMPLAINT | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| PED_ROLE | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| CONTRIBUTING_FACTOR_1 | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| CONTRIBUTING_FACTOR_2 | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
| PERSON_SEX | Length:4692054 | Class :character | Mode :character | NA | NA | NA | NA |
#### Load Data
vehicles_df <- read.csv('./Motor_Vehicle_Collisions_-_Vehicles.csv', stringsAsFactors = FALSE) %>%
mutate(CRASH_DATE = as.Date(CRASH_DATE, "%m/%d/%Y")) #3,704,406 x 25
# vehicles_df
# min(vehicles_df$CRASH_DATE) #"2012-07-01"
# max(vehicles_df$CRASH_DATE) #"2021-12-04"
kable(t(summary(vehicles_df))) %>% kable_classic(full_width = TRUE, html_font = "Cambria", font_size = 14)
| UNIQUE_ID | Min. : 111711 | 1st Qu.:14215160 | Median :17306058 | Mean :16060871 | 3rd Qu.:18739205 | Max. :20121717 | NA |
| COLLISION_ID | Min. : 22 | 1st Qu.:3017853 | Median :3567068 | Mean :2996659 | 3rd Qu.:4028145 | Max. :4484197 | NA |
| CRASH_DATE | Min. :2012-07-01 | 1st Qu.:2014-10-15 | Median :2016-11-18 | Mean :2016-11-21 | 3rd Qu.:2018-11-15 | Max. :2021-12-04 | NA |
| CRASH_TIME | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE_ID | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| STATE_REGISTRATION | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE_TYPE | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE_MAKE | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE_MODEL | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE_YEAR | Min. : 1000 | 1st Qu.: 2008 | Median : 2013 | Mean : 2015 | 3rd Qu.: 2016 | Max. :20063 | NA’s :1796971 |
| TRAVEL_DIRECTION | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE_OCCUPANTS | Min. :0.00e+00 | 1st Qu.:1.00e+00 | Median :1.00e+00 | Mean :1.01e+03 | 3rd Qu.:1.00e+00 | Max. :1.00e+09 | NA’s :1718406 |
| DRIVER_SEX | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| DRIVER_LICENSE_STATUS | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| DRIVER_LICENSE_JURISDICTION | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| PRE_CRASH | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| POINT_OF_IMPACT | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE_DAMAGE | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE_DAMAGE_1 | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE_DAMAGE_2 | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| VEHICLE_DAMAGE_3 | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| PUBLIC_PROPERTY_DAMAGE | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| PUBLIC_PROPERTY_DAMAGE_TYPE | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| CONTRIBUTING_FACTOR_1 | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
| CONTRIBUTING_FACTOR_2 | Length:3704406 | Class :character | Mode :character | NA | NA | NA | NA |
monthly_agg1 <- (person_df %>%
mutate(yr_mo = paste0(substr(CRASH_DATE,1,4),'-',substr(CRASH_DATE,6,7))) %>%
group_by(yr_mo) %>%
summarise(n = n()) %>%
arrange(yr_mo)
)
plot_ly(x=monthly_agg1$yr_mo, y=monthly_agg1$n, type='bar')
Need to use adjust timeframe due to missing data. Using data after 2017-01-01 for full year modeling
combined_df <- (crashes_df %>%
select(-c(CRASH.DATE, CRASH.TIME, CONTRIBUTING.FACTOR.VEHICLE.1, CONTRIBUTING.FACTOR.VEHICLE.2,
CONTRIBUTING.FACTOR.VEHICLE.3, CONTRIBUTING.FACTOR.VEHICLE.4, CONTRIBUTING.FACTOR.VEHICLE.5,
VEHICLE.TYPE.CODE.4, VEHICLE.TYPE.CODE.5,
NUMBER.OF.PEDESTRIANS.INJURED, NUMBER.OF.PEDESTRIANS.KILLED,
NUMBER.OF.CYCLIST.INJURED, NUMBER.OF.CYCLIST.KILLED,
ON.STREET.NAME, CROSS.STREET.NAME, OFF.STREET.NAME)) %>%
inner_join(vehicles_df %>%
filter(!is.na(VEHICLE_ID)) %>%
select(-c(CRASH_DATE, CRASH_TIME, VEHICLE_ID))
, by = 'COLLISION_ID') %>%
inner_join(person_df %>%
select(-c(UNIQUE_ID, CONTRIBUTING_FACTOR_1, CONTRIBUTING_FACTOR_2))
, by= c('COLLISION_ID'='COLLISION_ID', 'UNIQUE_ID'='VEHICLE_ID')) %>%
filter((PED_ROLE %in% c('Driver'))) %>% #drivers only
filter((CRASH_DATE >= '2017-01-01') & (CRASH_DATE < '2021-12-01')) %>% #only from 2017-01-01 to 2021-11-30
filter(PERSON_AGE > 14 & PERSON_AGE< 101) %>%
filter(PERSON_SEX == 'F' | PERSON_SEX=='M') %>%
mutate(MONTH = substr(CRASH_DATE,6,7)) %>%
mutate(TIMESTEP = as.period(interval(as.Date('2017-01-01'), CRASH_DATE)) %/% months(1)) %>%
mutate(yr_mo = paste0(substr(CRASH_DATE,1,4),'-',substr(CRASH_DATE,6,7))) %>%
select(COLLISION_ID, CRASH_DATE, PERSON_AGE, PERSON_SEX, MONTH, TIMESTEP, yr_mo) %>%
arrange(CRASH_DATE)
)
# min(combined_df$CRASH_DATE) #"2017-01-01"
# max(combined_df$CRASH_DATE) #"2021-11-30"
Filtered un-necessary data and selected columns of interest
monthly_agg2 <- (combined_df %>%
group_by(TIMESTEP, yr_mo, MONTH) %>%
summarise(n = n(), .groups = 'drop') %>%
arrange(TIMESTEP, yr_mo, MONTH)
)
plot_ly(x=monthly_agg2$yr_mo, y=monthly_agg2$n, type='bar')
Notice the huge COVID-19 Impact vehicle repair volumes that’s visible in this dataset!
monthly_agg3 <- (combined_df %>%
group_by(TIMESTEP, yr_mo, MONTH, PERSON_SEX, PERSON_AGE) %>%
summarise(n = n(), .groups = 'drop') %>%
arrange(TIMESTEP, yr_mo, MONTH, PERSON_SEX, PERSON_AGE)
)
train_df <- monthly_agg3 %>% filter(TIMESTEP <= 47) # <= 2020-12
test_df <- monthly_agg3 %>% filter(TIMESTEP > 47) # > 2020-12
lm_model <- lm(data = train_df, formula = n ~ TIMESTEP + MONTH + PERSON_AGE + PERSON_SEX ) #0.5457
summary(lm_model)
##
## Call:
## lm(formula = n ~ TIMESTEP + MONTH + PERSON_AGE + PERSON_SEX,
## data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -441.83 -64.83 4.23 58.78 299.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 352.82598 5.76802 61.169 < 2e-16 ***
## TIMESTEP -2.92530 0.09545 -30.646 < 2e-16 ***
## MONTH02 -10.16351 6.23069 -1.631 0.102889
## MONTH03 5.85024 6.23280 0.939 0.347955
## MONTH04 -17.30059 6.27380 -2.758 0.005837 **
## MONTH05 13.68088 6.25906 2.186 0.028863 *
## MONTH06 19.15099 6.25343 3.062 0.002203 **
## MONTH07 13.20462 6.25123 2.112 0.034691 *
## MONTH08 12.80755 6.26343 2.045 0.040908 *
## MONTH09 18.12040 6.27354 2.888 0.003883 **
## MONTH10 27.72229 6.28431 4.411 1.04e-05 ***
## MONTH11 22.61062 6.30315 3.587 0.000336 ***
## MONTH12 23.03260 6.32483 3.642 0.000273 ***
## PERSON_AGE -3.92660 0.05504 -71.344 < 2e-16 ***
## PERSON_SEXM 150.90222 2.55012 59.175 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 111.3 on 7619 degrees of freedom
## Multiple R-squared: 0.5457, Adjusted R-squared: 0.5449
## F-statistic: 653.8 on 14 and 7619 DF, p-value: < 2.2e-16
SST_lm_train <- sum((train_df$n - mean(train_df$n))^2)
SSE_lm_train <- sum((train_df$n - predict(lm_model, newdata = train_df))^2)
R_sq_lm_train <- 1-(SSE_lm_train/SST_lm_train)
R_sq_lm_train
## [1] 0.5457191
plot_ly(x=train_df$yr_mo, y=(predict(lm_model, newdata = train_df)-train_df$n), type='scatter', mode='markers') %>%
layout(title = 'Plot of Linear Model Residuals vs yr_mo for Training Data',
xaxis = list(title = 'yr_mo'),
yaxis = list(title = 'LM Residuals', rangemode = "tozero"))
SST_lm_test <- sum((test_df$n - mean(test_df$n))^2)
SSE_lm_test <- sum((test_df$n - predict(lm_model, newdata = test_df))^2)
R_sq_lm_test <- 1-(SSE_lm_test/SST_lm_test)
R_sq_lm_test
## [1] 0.08364779
plot_ly(x=test_df$yr_mo, y=(predict(lm_model, newdata = test_df)-test_df$n), type='scatter', mode='markers') %>%
layout(title = 'Plot of Linear Model Residuals vs yr_mo for Test Data',
xaxis = list(title = 'yr_mo'),
yaxis = list(title = 'LM Residuals', rangemode = "tozero"))
set.seed(12345)
rf_model <- randomForest(n ~ TIMESTEP + MONTH + PERSON_AGE + PERSON_SEX, data=train_df, importance=TRUE, ntree=110) #
summary(rf_model)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 7634 -none- numeric
## mse 110 -none- numeric
## rsq 110 -none- numeric
## oob.times 7634 -none- numeric
## importance 8 -none- numeric
## importanceSD 4 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 7634 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
var_importance_df <- data.frame(importance(rf_model)) %>% rename('PCT_IncMSE'='X.IncMSE') %>% arrange(desc(PCT_IncMSE))
var_importance_df
## PCT_IncMSE IncNodePurity
## PERSON_SEX 47.946796 48776557
## PERSON_AGE 23.222092 77649096
## TIMESTEP 21.783257 15641249
## MONTH 9.431747 1125982
SST_rf_train <- sum((train_df$n - mean(train_df$n))^2)
SSE_rf_train <- sum((train_df$n - predict(rf_model, newdata = train_df))^2)
R_sq_rf_train <- 1-(SSE_rf_train/SST_rf_train)
R_sq_rf_train
## [1] 0.8639321
plot_ly(x=seq(1,length(rf_model$mse),1), y=rf_model$mse, mode='lines+markers', type='scatter') %>%
layout(title = 'Plot of Random Forest Training Error vs Number of Trees',
xaxis = list(title = 'Number of Trees'),
yaxis = list(title = 'Error', rangemode = "tozero"))
plot_ly(x=train_df$yr_mo, y=(predict(rf_model, newdata = train_df)-train_df$n), type='scatter', mode='markers') %>%
layout(title = 'Plot of Random Forest Model Residuals vs yr_mo',
xaxis = list(title = 'yr_mo'),
yaxis = list(title = 'RF Residuals', rangemode = "tozero"))
SST_rf_test <- sum((test_df$n - mean(test_df$n))^2)
SSE_rf_test <- sum((test_df$n - predict(rf_model, newdata = test_df))^2)
R_sq_rf_test <- 1-(SSE_rf_test/SST_rf_test)
R_sq_rf_test
## [1] 0.7582554
plot_ly(x=test_df$yr_mo, y=(predict(rf_model, newdata = test_df)-test_df$n), type='scatter', mode='markers') %>%
layout(title = 'Plot of Random Forest Model Residuals vs yr_mo for Test Data',
xaxis = list(title = 'yr_mo'),
yaxis = list(title = 'RF Residuals', rangemode = "tozero"))
set.seed(12345)
min_leaf <- 3
min_split <- 3*min_leaf
cart_model <- rpart(n ~ TIMESTEP + MONTH + PERSON_AGE + PERSON_SEX, data=train_df,
control = c(minsplit = min_split, minbucket = min_leaf, cp=0.01)) #default = 0.01
SST_CART_train <- sum((train_df$n - mean(train_df$n))^2)
SSE_CART_train <- sum((train_df$n - predict(cart_model, newdata = train_df))^2)
R_sq_cart_train <- 1-(SSE_CART_train/SST_CART_train)
R_sq_cart_train
## [1] 0.9023338
data.frame(Variable_Importance = cart_model$variable.importance,
Variable_Importance_Pct_Tot = round(100*cart_model$variable.importance/sum(cart_model$variable.importance),0))
## Variable_Importance Variable_Importance_Pct_Tot
## PERSON_AGE 104059304 55
## PERSON_SEX 58373992 31
## TIMESTEP 25102534 13
rpart.plot(cart_model)
plot_ly(x=train_df$yr_mo, y=(predict(cart_model, newdata = train_df)-train_df$n), type='scatter', mode='markers') %>%
layout(title = 'Plot of CART Model Residuals vs yr_mo',
xaxis = list(title = 'yr_mo'),
yaxis = list(title = 'CART Residuals', rangemode = "tozero"))
SST_cart_test <- sum((test_df$n - mean(test_df$n))^2)
SSE_cart_test <- sum((test_df$n - predict(cart_model, newdata = test_df))^2)
R_sq_cart_test <- 1-(SSE_cart_test/SST_cart_test)
R_sq_cart_test
## [1] 0.65263
plot_ly(x=test_df$yr_mo, y=(predict(cart_model, newdata = test_df)-test_df$n), type='scatter', mode='markers') %>%
layout(title = 'Plot of CART Model Residuals vs yr_mo for Test Data',
xaxis = list(title = 'yr_mo'),
yaxis = list(title = 'CART Residuals', rangemode = "tozero"))
plot_ly(x=train_df$n, y=(predict(cart_model, newdata = train_df)), type='scatter', mode='markers', name='CART', alpha=0.3) %>%
add_trace(x=train_df$n, y=(predict(rf_model, newdata = train_df)), type='scatter', mode='markers', name='RF', alpha=0.3) %>%
add_trace(x=train_df$n, y=(predict(lm_model, newdata = train_df)), type='scatter', mode='markers', name='LM', alpha=0.3) %>%
add_trace(x=c(0,500), y=c(0,500), type='scatter', mode='line', name='Perfect', alpha=1) %>%
layout(title = 'Plot of All Model Predictions vs Actual Values - Training Data',
xaxis = list(title = 'Actual Value', range=c(0,700)),
yaxis = list(title = 'Model Prediction', rangemode = "tozero"))
plot_ly(x=test_df$n, y=(predict(cart_model, newdata = test_df)), type='scatter', mode='markers', name='CART', alpha=0.3) %>%
add_trace(x=test_df$n, y=(predict(rf_model, newdata = test_df)), type='scatter', mode='markers', name='RF', alpha=0.3) %>%
add_trace(x=test_df$n, y=(predict(lm_model, newdata = test_df)), type='scatter', mode='markers', name='LM', alpha=0.3) %>%
add_trace(x=c(0,300), y=c(0,300), type='scatter', mode='line', name='Perfect', alpha=1) %>%
layout(title = 'Plot of All Model Predictions vs Actual Values - Test Data',
xaxis = list(title = 'Actual Value', range=c(0,325)),
yaxis = list(title = 'Model Prediction', rangemode = "tozero"))
kable(data.frame(Model_Type = c('Linear', 'Random Forest', 'CART'),
R_sq_train = round(c(R_sq_lm_train, R_sq_rf_train, R_sq_cart_train), 2),
R_sq_test = round(c(R_sq_lm_test, R_sq_rf_test, R_sq_cart_test), 2))) %>%
kable_classic(full_width = FALSE, html_font = "Cambria")%>%
row_spec(2, background = c("yellow"))
| Model_Type | R_sq_train | R_sq_test |
|---|---|---|
| Linear | 0.55 | 0.08 |
| Random Forest | 0.86 | 0.76 |
| CART | 0.90 | 0.65 |
=> Moving forward with the Random Forest Model
#### Assume NYC Collision Shop has 1% of Total market share - Estimate their future monthly car volume
Shop_Market_Share <- 1/100
#Compute Actual from Test Data
test_agg_df <- test_df %>% group_by(MONTH) %>% summarise(Actual_NYC_Collisions=sum(n), Actual_Shop_Volume=round(Shop_Market_Share*sum(n), 0))
#Create dataset for future year (2021) ... maybe 2022 ...decide later with team (2021 allows comparison to real volume events)
temp1 <- data.frame(TIMESTEP = c(48:59))
temp2 <- data.frame(MONTH = c('01','02','03','04','05','06','07','08','09','10','11','12'))
temp3 <- data.frame(PERSON_AGE = c(15:100))
temp4 <- data.frame(PERSON_SEX = c('M','F'))
monthly_predictions_df <- temp1 %>% bind_cols(temp2) %>% full_join(temp3, by=character()) %>% full_join(temp4, by=character())
#Create predictions
monthly_predictions_df$predictions <- predict(rf_model, newdata = monthly_predictions_df)
monthly_predictions_agg_df <- (monthly_predictions_df %>%
group_by(MONTH) %>%
summarise(Predicted_NYC_Collisions=round(sum(predictions), 0)) %>%
mutate(Predicted_Shop_Volume = round(Shop_Market_Share*Predicted_NYC_Collisions, 0)) %>%
left_join(test_agg_df, by='MONTH') %>%
mutate(YEAR = 2021) %>%
select(YEAR, MONTH, Actual_NYC_Collisions, Actual_Shop_Volume, Predicted_NYC_Collisions, Predicted_Shop_Volume)
)
kable(monthly_predictions_agg_df) %>%
kable_classic(full_width = FALSE, html_font = "Cambria")
| YEAR | MONTH | Actual_NYC_Collisions | Actual_Shop_Volume | Predicted_NYC_Collisions | Predicted_Shop_Volume |
|---|---|---|---|---|---|
| 2021 | 01 | 9671 | 97 | 16482 | 165 |
| 2021 | 02 | 8432 | 84 | 15864 | 159 |
| 2021 | 03 | 10648 | 106 | 15574 | 156 |
| 2021 | 04 | 11501 | 115 | 12726 | 127 |
| 2021 | 05 | 13394 | 134 | 14655 | 147 |
| 2021 | 06 | 13745 | 137 | 14868 | 149 |
| 2021 | 07 | 12645 | 126 | 15234 | 152 |
| 2021 | 08 | 12473 | 125 | 15381 | 154 |
| 2021 | 09 | 12623 | 126 | 15488 | 155 |
| 2021 | 10 | 13097 | 131 | 15758 | 158 |
| 2021 | 11 | 11522 | 115 | 15324 | 153 |
| 2021 | 12 | NA | NA | 15081 | 151 |
plot_ly(x=monthly_predictions_agg_df$MONTH, y=monthly_predictions_agg_df$Actual_Shop_Volume,
type='bar', name='Actual_Shop_Volume') %>%
add_trace(x=monthly_predictions_agg_df$MONTH, y=monthly_predictions_agg_df$Predicted_Shop_Volume,
type='bar', name='Predicted_Shop_Volume') %>%
layout(title = 'Plot of Actual_Shop_Volume and Predicted_Shop_Volume (Test Set)',
xaxis = list(title = 'Months in 2021'),
yaxis = list(title = 'Car volume'))
kable(train_df %>%
group_by(PERSON_SEX) %>%
summarise(n = sum(n)) %>%
arrange(PERSON_SEX)) %>%
kable_classic(full_width = FALSE, html_font = "Cambria")
| PERSON_SEX | n |
|---|---|
| F | 313706 |
| M | 895649 |
kable(train_df %>%
mutate(age_group = 5*floor(PERSON_AGE/5)) %>%
group_by(age_group) %>%
summarise(n = sum(n)) %>%
arrange(age_group)) %>%
kable_classic(full_width = FALSE, html_font = "Cambria")
| age_group | n |
|---|---|
| 15 | 26719 |
| 20 | 113808 |
| 25 | 161005 |
| 30 | 155614 |
| 35 | 137152 |
| 40 | 122047 |
| 45 | 116100 |
| 50 | 111292 |
| 55 | 98774 |
| 60 | 73487 |
| 65 | 44235 |
| 70 | 25668 |
| 75 | 13053 |
| 80 | 6627 |
| 85 | 2720 |
| 90 | 834 |
| 95 | 197 |
| 100 | 23 |
Use the data below to determine how much volume can be gained by an advertising campaign targeting a certain demographic
Link
Across all industries, the average CTR for a search ad is 1.91%, and 0.35% for a display ad.
=> Determine cost of a promotion and then outline the potential increase in marketshare for that demographic